import requests
from bs4 import BeautifulSoup
from url_manage import UrlManage
import re
import pymongo

# Connect to the local MongoDB instance.
client = pymongo.MongoClient('mongodb://127.0.0.1:27017')

# Select the working database and the collection (table) for crawl results.
crawlerDB = client.crawlerDB
crazyant_collection = crawlerDB.crazyant

# Seed URL that starts the crawl.
url = 'http://www.crazyant.net/'

# URL manager tracks pending vs. already-visited URLs.
url_manger = UrlManage()
url_manger.add_new_url(url)


# Article-page pattern, compiled once (hoisted out of the crawl loop).
# Dots are escaped: previously `.` matched any character, so e.g.
# "http://wwwXcrazyant.net/123Yhtml" would have been accepted.
ARTICLE_PATTERN = re.compile(r'^http://www\.crazyant\.net/\d+\.html$')

# Crawl loop: pop a pending URL, fetch it, store its title in MongoDB,
# then enqueue every on-page link that looks like an article URL.
while url_manger.has_new_url():
  demo_url = url_manger.get_url()
  try:
    r = requests.get(demo_url, timeout=3)
  except requests.RequestException:
    # A timeout or connection error on one page must not abort the crawl.
    print('网站访问错误')
    continue
  if r.status_code != 200:
    print('网站访问错误')
  else:
    soup = BeautifulSoup(r.text, 'html.parser')
    # soup.title is None on pages without a <title> tag; store None then.
    title = soup.title.string if soup.title is not None else None
    crazyant_collection.insert_one({'url': demo_url, 'title': title})
    print(demo_url)
    print(title)
    for link in soup.find_all('a'):
      # .get() returns None for <a> tags without an href attribute,
      # whereas link['href'] would raise KeyError.
      href = link.get('href')
      if href is not None and ARTICLE_PATTERN.match(href):
        url_manger.add_new_url(href)


